package com.aboat;

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Crawler that follows web pages and saves every image it fetches into a local directory.
 *
 * <p>HTML responses are scanned for {@code <img src>} URLs, which are queued as follow-up
 * tasks. Image responses are written to the download directory under sequential numeric
 * file names ({@code <id>.<extension>}); numbering continues after the highest id already
 * present in that directory.
 *
 * @author xyj
 * @email clear_windy@yeah.net
 * @create 2017-03-04 22:39
 */
public class ImageCrawler extends BreadthCrawler {

    /** Extension used when the Content-Type header carries no usable subtype. */
    private static final String DEFAULT_EXTENSION = "img";

    // Atomic counter used to generate image file names (main configures several crawl threads).
    AtomicInteger imageId;

    // Directory the downloaded images are saved into.
    File downloadDir;

    /**
     * Creates a crawler that stores its crawl state under {@code crawlPath} and saves
     * downloaded images into {@code downloadPath}, creating that directory if necessary.
     *
     * @param crawlPath    path handed to the {@code BreadthCrawler} constructor
     * @param downloadPath directory the images are written to
     * @throws IllegalStateException if the download directory does not exist and cannot be created
     */
    public ImageCrawler(String crawlPath, String downloadPath) {
        // NOTE(review): the second argument is presumably WebCollector's auto-parse flag — confirm.
        super(crawlPath, true);
        downloadDir = new File(downloadPath);
        // mkdirs() returns false both on failure and when another process created the
        // directory first, hence the final isDirectory() re-check.
        if (!downloadDir.isDirectory() && !downloadDir.mkdirs() && !downloadDir.isDirectory()) {
            throw new IllegalStateException(
                    "Cannot create download directory: " + downloadDir.getAbsolutePath());
        }
        // Use the private helper directly: constructors must not call overridable methods.
        imageId = new AtomicInteger(findMaxImageId(downloadDir));
    }

    /**
     * Handles one fetched resource. HTML pages contribute their image URLs to {@code links};
     * images are saved to the download directory. Any other content type is ignored.
     *
     * @param page  the fetched resource
     * @param links collector for URLs to be crawled next
     * @throws RuntimeException if an image cannot be written to disk (wraps the {@link IOException})
     */
    public void visit(Page page, Links links) {
        // The Content-Type response header decides whether this resource is a page or an image.
        String contentType = page.getResponse().getContentType();
        if (contentType == null) {
            return;
        }
        // Media types are case-insensitive, so compare on a locale-independent lower-case copy.
        String mediaType = contentType.toLowerCase(Locale.ROOT);

        if (mediaType.contains("html")) {
            // For a web page, extract the image URLs and queue them as follow-up tasks.
            Elements imgs = page.getDoc().select("img[src]");
            for (Element img : imgs) {
                String imgSrc = img.attr("abs:src");
                // "abs:src" is empty when the URL cannot be made absolute; skip those and thumbnails.
                if (!imgSrc.isEmpty() && !imgSrc.contains("thumb")) {
                    links.add(imgSrc);
                }
            }
        } else if (mediaType.startsWith("image")) {
            // For an image, save the response body directly.
            String imageFileName = imageId.incrementAndGet() + "." + extensionFor(mediaType);
            File imageFile = new File(downloadDir, imageFileName);
            try {
                FileUtils.writeFile(imageFile, page.getContent());
                System.out.println("保存图片 "+page.getUrl()+" 到 "+imageFile.getAbsolutePath());
            } catch (IOException ex) {
                throw new RuntimeException(
                        "Failed to save image to " + imageFile.getAbsolutePath(), ex);
            }
        }
    }

    /**
     * Re-scans the download directory and resets the id counter to the highest numeric file
     * name found there ({@code -1} if there is none), so the next saved image gets the
     * following number.
     */
    public void computeImageId() {
        imageId = new AtomicInteger(findMaxImageId(downloadDir));
    }

    /**
     * Returns the largest integer that prefixes a file name (the part before the first dot)
     * in {@code dir}, or {@code -1} when the directory cannot be listed or holds no such file.
     */
    private static int findMaxImageId(File dir) {
        int maxId = -1;
        // listFiles() returns null when dir is not a directory or an I/O error occurs.
        File[] entries = dir.listFiles();
        if (entries == null) {
            return maxId;
        }
        for (File entry : entries) {
            String fileName = entry.getName();
            int dot = fileName.indexOf('.');
            String idStr = dot < 0 ? fileName : fileName.substring(0, dot);
            try {
                maxId = Math.max(maxId, Integer.parseInt(idStr));
            } catch (NumberFormatException ignored) {
                // Not a numerically named file (e.g. Thumbs.db, .DS_Store); irrelevant for numbering.
            }
        }
        return maxId;
    }

    /**
     * Derives a file extension from a media type such as {@code image/jpeg; charset=x}:
     * the leading run of ASCII letters, digits and hyphens of the subtype. Everything else
     * (parameters, {@code +xml} suffixes, path separators sent by a hostile server) is cut
     * off so the value is always safe to use inside a file name.
     */
    private static String extensionFor(String mediaType) {
        int slash = mediaType.indexOf('/');
        if (slash < 0) {
            return DEFAULT_EXTENSION;
        }
        int start = slash + 1;
        int end = start;
        while (end < mediaType.length() && isExtensionChar(mediaType.charAt(end))) {
            end++;
        }
        return end > start ? mediaType.substring(start, end) : DEFAULT_EXTENSION;
    }

    /** Tells whether {@code c} may appear in a generated file extension. */
    private static boolean isExtensionChar(char c) {
        return (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '-';
    }

    /**
     * Runs the crawler against meitulu.com.
     *
     * @param args optional: {@code args[0]} crawl-state/log path (default {@code D:\spider\log}),
     *             {@code args[1]} image download path (default {@code D:\spider\picture})
     * @throws Exception if the crawler fails with anything other than an {@link IOException}
     */
    public static void main(String[] args) throws Exception {
        String crawlPath = args.length > 0 ? args[0] : "D:\\spider\\log";
        String downloadPath = args.length > 1 ? args[1] : "D:\\spider\\picture";
        ImageCrawler crawler = new ImageCrawler(crawlPath, downloadPath);
        try {
            // Site to crawl.
            crawler.addSeed("http://www.meitulu.com/");
            // URL pattern registered with the crawler; presumably limits which links are followed.
            String strReg = "http://www.meitulu.com/item/.*";
            crawler.addRegex(strReg);
            crawler.setThreads(16);
            // NOTE(review): the argument is presumably the crawl depth — confirm against WebCollector docs.
            crawler.start(8);
        } catch (IOException e) {
            // Previously swallowed silently; keep the non-crashing exit but make the failure visible.
            System.err.println("Crawl aborted by I/O error: " + e);
            e.printStackTrace();
        }
    }

}
